# --- Environment setup and data load ---
import pandas as pd
pd.options.mode.chained_assignment = 'warn' # default='warn'
import numpy as np
import itertools
import scipy
import plotly.express as px
import plotly.graph_objects as go
# plotly version 5.2.1
import plotly.io as pio
pio.renderers.default = "notebook"
# Load the wine-review dataset; ~130k rows per the filtering notes further down.
reviews = pd.read_csv('winereviews2.csv')
# Preview column names and a few sample rows.
reviews.head()
| Unnamed: 0 | country | description | designation | points | price | province | region_1 | region_2 | taster_name | taster_twitter_handle | title | variety | winery | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Italy | Aromas include tropical fruit, broom, brimston... | Vulkà Bianco | 87 | NaN | Sicily & Sardinia | Etna | NaN | Kerin O’Keefe | @kerinokeefe | Nicosia 2013 Vulkà Bianco (Etna) | White Blend | Nicosia |
| 1 | 1 | Portugal | This is ripe and fruity, a wine that is smooth... | Avidagos | 87 | 15.0 | Douro | NaN | NaN | Roger Voss | @vossroger | Quinta dos Avidagos 2011 Avidagos Red (Douro) | Portuguese Red | Quinta dos Avidagos |
| 2 | 2 | US | Tart and snappy, the flavors of lime flesh and... | NaN | 87 | 14.0 | Oregon | Willamette Valley | Willamette Valley | Paul Gregutt | @paulgwine | Rainstorm 2013 Pinot Gris (Willamette Valley) | Pinot Gris | Rainstorm |
| 3 | 3 | US | Pineapple rind, lemon pith and orange blossom ... | Reserve Late Harvest | 87 | 13.0 | Michigan | Lake Michigan Shore | NaN | Alexander Peartree | NaN | St. Julian 2013 Reserve Late Harvest Riesling ... | Riesling | St. Julian |
| 4 | 4 | US | Much like the regular bottling from 2012, this... | Vintner's Reserve Wild Child Block | 87 | 65.0 | Oregon | Willamette Valley | Willamette Valley | Paul Gregutt | @paulgwine | Sweet Cheeks 2012 Vintner's Reserve Wild Child... | Pinot Noir | Sweet Cheeks |
# Number of reviews with a missing price.
reviews['price'].isnull().sum()
total = 8996
Solution: Drop (price is going to be a key observation for this dataset) considered interpolate
but standard deviation of price column is very high, with more time could interpolate better
# Number of reviews with a missing province.
# Fix: the column name was misspelled 'prvoince', which raises a KeyError.
np.sum(reviews['province'].isnull())
total = 63
Solution: ignore (secondary information)
# Number of reviews with no taster attribution.
reviews['taster_name'].isnull().sum()
total = 26244
Solution: Drop (untrustworthy)
# Number of reviews with a missing variety.
reviews['variety'].isnull().sum()
total = 1
Investigate: reviews[reviews.variety.isnull()] would be dropped anyway, taster_name = NaN
# Inspect the single row whose variety is missing.
reviews.loc[reviews['variety'].isnull()]
| Unnamed: 0 | country | description | designation | points | price | province | region_1 | region_2 | taster_name | taster_twitter_handle | title | variety | winery | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 86909 | 86909 | Chile | A chalky, dusty mouthfeel nicely balances this... | NaN | 88 | 17.0 | Maipo Valley | NaN | NaN | NaN | NaN | Carmen 1999 (Maipo Valley) | NaN | Carmen |
# Remove every review that lacks a price.
reviews = reviews.dropna(subset=['price'])
reviews
# 120975 rows after filter. 8996 rows labeled na, 129971 unfiltered, checks out
# Also drop reviews with no taster attribution; keep the result as a separate frame.
reviewsClean = reviews.dropna(subset=['taster_name'])
reviewsClean.shape
(96479, 14)
# Summary statistics for points and price.
# Fix: three expressions were fused onto a single line (a SyntaxError as written);
# split into separate statements. Only the last expression is rendered, which
# matches the price-describe output shown below.
reviews.points.describe()
reviews.price.describe()
count 120975.000000 mean 35.363389 std 41.022218 min 4.000000 25% 17.000000 50% 25.000000 75% 42.000000 max 3300.000000 Name: price, dtype: float64
Big jump after the 75% quantile, lets look at whats above that. 25 to 42 could be a good bucket
# Summary stats for wines priced above the 75% quantile ($42).
reviewsClean[reviewsClean['price'] > 42].describe()
| Unnamed: 0 | points | price | |
|---|---|---|---|
| count | 22845.000000 | 22845.000000 | 22845.000000 |
| mean | 65695.210418 | 91.070344 | 76.115167 |
| std | 37707.806137 | 2.570116 | 73.180289 |
| min | 4.000000 | 81.000000 | 43.000000 |
| 25% | 33397.000000 | 90.000000 | 50.000000 |
| 50% | 66008.000000 | 91.000000 | 60.000000 |
| 75% | 98462.000000 | 93.000000 | 79.000000 |
| max | 129967.000000 | 100.000000 | 3300.000000 |
Conclusion: Winespectator primarily reviews moderately priced wines, but seems to be a good distribution above that, with nearly 23,000 reviews above the 75% quantile. Wine price between 42 to 79 could be a good bucket.
Let's do some basic relationship analysis. start with price v points
# Scatter plot of price vs. score for all cleaned reviews.
fig = px.scatter(x=reviewsClean['price'], y=reviewsClean['points'],
labels = {
'x' : 'price($)',
'y' : 'score'
},
color_discrete_sequence=px.colors.qualitative.Dark24,
title = 'Price of wines versus Score'
)
# Render the figure (last expression in the cell).
fig
Lots of price outliers, lets look at the bulk of our data with price less than 1000
# Restrict to the bulk of the data (price < $1000) to reduce outlier influence.
cheaperWinesDF = reviewsClean.loc[reviewsClean['price'] < 1000]
# OLS trendline fitted against log(price) via trendline_options.
fig = px.scatter(x=cheaperWinesDF['price'], y=cheaperWinesDF['points'],
trendline="ols",
trendline_options = dict(log_x = True),
labels = {
'x' : 'price($)',
'y' : 'score'
},
color_discrete_sequence=px.colors.qualitative.Dark24,
title = 'Price of wines less than $1000 versus Score'
)
fig
Not a bad logarithmic fit at first glance.
Lets look at the 5 most popular regions and see if any other trends emerge
Looking at value_counts() gives us the most popular regions as: US, France, Italy, Spain, Portugal, Chile.
Lets filter our DF
# Five most-reviewed countries.
# NOTE(review): the prose above also lists Portugal among the top countries,
# but it is absent from this filter — confirm whether that omission is intentional.
popCountryFilter = ['US', 'France', 'Italy', 'Spain', 'Chile']
popCountryDF = cheaperWinesDF.loc[cheaperWinesDF['country'].isin(popCountryFilter)]
# One log-x OLS trendline per country (color split).
fig = px.scatter(popCountryDF, x='price', y='points', color = "country",
trendline="ols",
trendline_options = dict(log_x = True),
labels = {
'price' : 'price($)',
'points' : 'score'
},
title = 'Price of wines less than $1000 versus Score of the 5 most Reviewed Countries'
)
fig
Interesting...the trendline for italy is significantly different from the other 4 countries let's group by these 5 countries and look at the stats
# Per-country summary stats for points and price.
# Fix: select the columns with a list — tuple-style multi-key indexing is
# deprecated (it is what triggered the FutureWarning shown below).
popCountryDF.groupby('country')[['points', 'price']].describe()
C:\Users\jmeis\AppData\Local\Temp/ipykernel_16176/1681514197.py:1: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
| points | price | |||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| country | ||||||||||||||||
| Chile | 4305.0 | 86.514286 | 2.701515 | 80.0 | 85.0 | 86.0 | 88.0 | 95.0 | 4305.0 | 20.869686 | 22.074108 | 5.0 | 12.0 | 15.0 | 21.0 | 400.0 |
| France | 17512.0 | 88.746231 | 3.008395 | 80.0 | 87.0 | 88.0 | 91.0 | 100.0 | 17512.0 | 39.912003 | 54.504929 | 5.0 | 16.0 | 25.0 | 43.0 | 973.0 |
| Italy | 10121.0 | 88.885090 | 2.508983 | 80.0 | 87.0 | 89.0 | 90.0 | 100.0 | 10121.0 | 41.607450 | 38.452822 | 6.0 | 20.0 | 30.0 | 50.0 | 800.0 |
| Spain | 6509.0 | 87.315256 | 3.066390 | 80.0 | 85.0 | 87.0 | 89.0 | 98.0 | 6509.0 | 28.343063 | 34.794963 | 4.0 | 13.0 | 18.0 | 30.0 | 770.0 |
| US | 37510.0 | 88.980085 | 2.835977 | 80.0 | 87.0 | 89.0 | 91.0 | 100.0 | 37510.0 | 36.135750 | 24.030075 | 4.0 | 20.0 | 30.0 | 45.0 | 750.0 |
Nothing is immediately obvious; Italian wines have a slight bias towards being more expensive than the other countries.
Hypothesis: Italian wines have lower value(cost/score ratio) than the other 4 sampled countries. We will dive into this later.
Let's breakup wine price into buckets. No general consensus seems to exist in the wine industry as to what price buckets would be apporpriate, so let's use the data itself by quantile:
# Create Buckets
# Edges follow the empirical price quantiles discussed above.
cut_labels = ['Cheap', 'Inexpensive', 'Moderate', 'Pricey', 'Expensive', 'Outlandish']
cut_bins = [4, 17, 25, 42, 79, 175, 3300]
# Start with an independent copy so assignments cannot alias (or warn about)
# reviewsClean — this replaces the concat-then-rename workaround, which relied
# on a fragile hard-coded column list.
reviewsCleanBuckets = reviewsClean.copy()
# Bin each price into its bucket. include_lowest=True fixes a latent bug:
# pd.cut intervals are left-open, so wines priced exactly $4 (the observed
# minimum) would otherwise fall outside every bucket and get NaN.
reviewsCleanBuckets['price_bucket'] = pd.cut(reviewsCleanBuckets['price'],
                                             bins=cut_bins, labels=cut_labels,
                                             include_lowest=True)
# Drop the redundant index column for display only (non-destructive: returns a
# new frame; the column intentionally remains in reviewsCleanBuckets itself).
reviewsCleanBuckets.drop('Unnamed: 0', axis = 1 )
| country | description | designation | points | price | province | region_1 | region_2 | taster_name | taster_twitter_handle | title | variety | winery | price_bucket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Portugal | This is ripe and fruity, a wine that is smooth... | Avidagos | 87 | 15.0 | Douro | NaN | NaN | Roger Voss | @vossroger | Quinta dos Avidagos 2011 Avidagos Red (Douro) | Portuguese Red | Quinta dos Avidagos | Cheap |
| 2 | US | Tart and snappy, the flavors of lime flesh and... | NaN | 87 | 14.0 | Oregon | Willamette Valley | Willamette Valley | Paul Gregutt | @paulgwine | Rainstorm 2013 Pinot Gris (Willamette Valley) | Pinot Gris | Rainstorm | Cheap |
| 3 | US | Pineapple rind, lemon pith and orange blossom ... | Reserve Late Harvest | 87 | 13.0 | Michigan | Lake Michigan Shore | NaN | Alexander Peartree | NaN | St. Julian 2013 Reserve Late Harvest Riesling ... | Riesling | St. Julian | Cheap |
| 4 | US | Much like the regular bottling from 2012, this... | Vintner's Reserve Wild Child Block | 87 | 65.0 | Oregon | Willamette Valley | Willamette Valley | Paul Gregutt | @paulgwine | Sweet Cheeks 2012 Vintner's Reserve Wild Child... | Pinot Noir | Sweet Cheeks | Pricey |
| 5 | Spain | Blackberry and raspberry aromas show a typical... | Ars In Vitro | 87 | 15.0 | Northern Spain | Navarra | NaN | Michael Schachner | @wineschach | Tandem 2011 Ars In Vitro Tempranillo-Merlot (N... | Tempranillo-Merlot | Tandem | Cheap |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 129966 | Germany | Notes of honeysuckle and cantaloupe sweeten th... | Brauneberger Juffer-Sonnenuhr Spätlese | 90 | 28.0 | Mosel | NaN | NaN | Anna Lee C. Iijima | NaN | Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ... | Riesling | Dr. H. Thanisch (Erben Müller-Burggraef) | Moderate |
| 129967 | US | Citation is given as much as a decade of bottl... | NaN | 90 | 75.0 | Oregon | Oregon | Oregon Other | Paul Gregutt | @paulgwine | Citation 2004 Pinot Noir (Oregon) | Pinot Noir | Citation | Pricey |
| 129968 | France | Well-drained gravel soil gives this wine its c... | Kritt | 90 | 30.0 | Alsace | Alsace | NaN | Roger Voss | @vossroger | Domaine Gresser 2013 Kritt Gewurztraminer (Als... | Gewürztraminer | Domaine Gresser | Moderate |
| 129969 | France | A dry style of Pinot Gris, this is crisp with ... | NaN | 90 | 32.0 | Alsace | Alsace | NaN | Roger Voss | @vossroger | Domaine Marcel Deiss 2012 Pinot Gris (Alsace) | Pinot Gris | Domaine Marcel Deiss | Moderate |
| 129970 | France | Big, rich and off-dry, this is powered by inte... | Lieu-dit Harth Cuvée Caroline | 90 | 21.0 | Alsace | Alsace | NaN | Roger Voss | @vossroger | Domaine Schoffit 2012 Lieu-dit Harth Cuvée Car... | Gewürztraminer | Domaine Schoffit | Inexpensive |
96479 rows × 14 columns
Let's look at a boxplot of the price buckets to see how our data is distributed
# Box plot of rating per price bucket, buckets ordered cheapest to priciest.
fig = px.box(reviewsCleanBuckets, x="price_bucket", y="points",
category_orders = {
'price_bucket' : ['Cheap', 'Inexpensive', 'Moderate', 'Pricey', 'Expensive', 'Outlandish']
},
labels = {
'price_bucket' : 'price($)',
'points' : 'rating'
},
title = 'Distribution of Rating vs Price Bucket'
)
fig.show()
Interesting...there is a clear upward trend of rating versus price.
The interquartile spread seems to be consistent except for the 'Expensive' bucket, which is noticeably larger.
Let's see if individual countries are skewing any of this data.
# Same bucket box plot, split by the five most-reviewed countries.
popCountryFilter = ['US', 'France', 'Italy', 'Spain', 'Chile']
popReviewsCleanBuckets = reviewsCleanBuckets.loc[reviewsCleanBuckets['country'].isin(popCountryFilter)]
fig = px.box(popReviewsCleanBuckets, x="price_bucket", y="points", color = 'country',
category_orders = {
'price_bucket' : ['Cheap', 'Inexpensive', 'Moderate', 'Pricey', 'Expensive', 'Outlandish']
},
labels = {
'price_bucket' : 'price($)',
'points' : 'rating'
},
title = 'Distribution of Rating vs Price Bucket by Country'
)
fig.show()
US wines seem to be the least consistent, with the largest average spread over price.
Italian wines get more inconsistent as price increases.
Spanish wines are fairly consistent over all price values.
French wines are fairly consistent over all price values but get more consistent in the Outlandish category.
Before we continue our quest for value in wine, we need to look for any obvious bias in the review scores
Let's look at some of the most prolific reviwers and their most reviewed regions, and see if they match the general statistics of the region as a whole
Find the top 10 reviwers
# Review counts per taster, descending — used to pick the most prolific reviewers.
reviewsClean['taster_name'].value_counts()
Roger Voss 20172 Michael Schachner 14951 Kerin O’Keefe 9874 Virginie Boone 9507 Paul Gregutt 9498 Matt Kettmann 6237 Joe Czerwinski 5012 Sean P. Sullivan 4925 Anna Lee C. Iijima 4369 Jim Gordon 4171 Anne Krebiehl MW 3398 Lauren Buzzeo 1713 Susan Kostrzewa 1073 Mike DeSimone 504 Jeff Jenssen 491 Alexander Peartree 413 Carrie Dykes 138 Fiona Adams 27 Christina Pickard 6 Name: taster_name, dtype: int64
The top 10 reviewers are: Roger Voss, Michael Schachner, Kerin O’Keefe, Virginie Boone, Paul Gregutt, Matt Kettmann, Joe Czerwinski, Sean P. Sullivan, Anna Lee C. Iijima, Jim Gordon
# Top 10 reviewers by review count.
# Fix: 'Kerin O’Keefe' and 'Virginie Boone' were accidentally fused into a
# single string, so the isin() filter silently dropped both reviewers (neither
# appears in the result table below).
topReviwersList = ['Roger Voss', 'Michael Schachner', 'Kerin O’Keefe', 'Virginie Boone',
                   'Paul Gregutt', 'Matt Kettmann', 'Joe Czerwinski', 'Sean P. Sullivan',
                   'Anna Lee C. Iijima', 'Jim Gordon']
# filter DF for these reviewers and top 5 countries
topReviewersDF = reviewsClean.loc[(reviewsClean['taster_name'].isin(topReviwersList)) &
                                  (reviewsClean['country'].isin(popCountryFilter))]
# groupby reviewer and country and describe the points distribution
topReviewersDF.groupby(['country', 'taster_name']).describe().points
| count | mean | std | min | 25% | 50% | 75% | max | ||
|---|---|---|---|---|---|---|---|---|---|
| country | taster_name | ||||||||
| Chile | Joe Czerwinski | 23.0 | 85.043478 | 3.022315 | 80.0 | 82.50 | 85.0 | 87.50 | 91.0 |
| Matt Kettmann | 1.0 | 89.000000 | NaN | 89.0 | 89.00 | 89.0 | 89.00 | 89.0 | |
| Michael Schachner | 4281.0 | 86.521607 | 2.697968 | 80.0 | 85.00 | 86.0 | 88.00 | 95.0 | |
| France | Joe Czerwinski | 1129.0 | 88.689991 | 2.743383 | 80.0 | 87.00 | 89.0 | 91.00 | 96.0 |
| Michael Schachner | 20.0 | 82.850000 | 3.869925 | 80.0 | 81.00 | 82.0 | 82.00 | 95.0 | |
| Paul Gregutt | 34.0 | 89.176471 | 2.938466 | 82.0 | 87.25 | 89.5 | 91.75 | 95.0 | |
| Roger Voss | 14395.0 | 88.671483 | 3.020005 | 80.0 | 86.00 | 88.0 | 91.00 | 100.0 | |
| Italy | Joe Czerwinski | 88.0 | 86.102273 | 2.387899 | 82.0 | 84.00 | 86.0 | 88.00 | 91.0 |
| Michael Schachner | 70.0 | 89.171429 | 3.115978 | 80.0 | 87.00 | 89.0 | 91.75 | 96.0 | |
| Paul Gregutt | 4.0 | 87.250000 | 2.500000 | 84.0 | 86.25 | 87.5 | 88.50 | 90.0 | |
| Roger Voss | 85.0 | 88.858824 | 2.376236 | 84.0 | 87.00 | 89.0 | 91.00 | 94.0 | |
| Spain | Joe Czerwinski | 1.0 | 85.000000 | NaN | 85.0 | 85.00 | 85.0 | 85.00 | 85.0 |
| Michael Schachner | 6503.0 | 87.317238 | 3.066638 | 80.0 | 85.00 | 87.0 | 89.00 | 98.0 | |
| Paul Gregutt | 4.0 | 85.750000 | 2.061553 | 84.0 | 84.00 | 85.5 | 87.25 | 88.0 | |
| US | Anna Lee C. Iijima | 2425.0 | 87.441649 | 2.097853 | 81.0 | 86.00 | 87.0 | 89.00 | 94.0 |
| Jim Gordon | 4171.0 | 88.625989 | 2.700144 | 80.0 | 87.00 | 89.0 | 91.00 | 97.0 | |
| Joe Czerwinski | 103.0 | 85.320388 | 2.804805 | 80.0 | 83.00 | 85.0 | 87.00 | 92.0 | |
| Matt Kettmann | 6236.0 | 90.034157 | 2.567008 | 81.0 | 88.00 | 90.0 | 92.00 | 97.0 | |
| Michael Schachner | 71.0 | 86.267606 | 3.480385 | 80.0 | 83.00 | 86.0 | 89.00 | 93.0 | |
| Paul Gregutt | 9271.0 | 89.079064 | 2.823103 | 80.0 | 87.00 | 89.0 | 91.00 | 100.0 | |
| Roger Voss | 2.0 | 89.500000 | 2.121320 | 88.0 | 88.75 | 89.5 | 90.25 | 91.0 | |
| Sean P. Sullivan | 4893.0 | 88.746372 | 2.463314 | 80.0 | 87.00 | 89.0 | 91.00 | 97.0 |
some reviwers do not have enough reviews in a country to look for bias, we will exclude them
include List:
Chile: Michael Schachner has done 95% of reviews for this region
France: [Joe Czerwinski, Roger Voss] Roger Voss has done 95% of reviews for this region
Italy: [Joe Czerwinski, Michael Schachner,Roger Voss]
Spain: Michael Schachner has done 99% of reviews for this country
US: [Anna Lee C. Iijima, Jim Gordon, Joe Czerwinski, Matt Kettmann, Michael Schachner, Paul Gregutt, Sean P. Sullivan]
### Italy
# Per-reviewer rating distributions for Italy, limited to reviewers with
# meaningful Italian coverage.
graphDF1 = topReviewersDF.loc[(topReviewersDF['taster_name'].isin(['Joe Czerwinski', 'Michael Schachner', 'Roger Voss'] ))&
(topReviewersDF['country'] == 'Italy')]
# All Italian reviews — baseline distribution for comparison.
graphDF2 = reviewsClean.loc[reviewsClean['country'] == 'Italy']
fig = px.box(graphDF1, x = 'taster_name', y="points",
labels = {
'points' : 'Rating',
'taster_name': 'Reviewer'
},
color_discrete_sequence=px.colors.qualitative.Dark24,
title = 'Italy Wine Reviews'
)
fig2 = px.box(graphDF2, x='country', y='points',
labels = {
'country' : 'Total Reviws'
}
)
# Overlay the country-wide box next to the per-reviewer boxes.
fig.add_trace(fig2.data[0])
fig.show()
Joe Czerwinski seems to be biased against Italian wines, Roger Voss' reviews seem to line up with the general Italian wine reviews, while Michael Schachner seems to have a slight bias favoring Italian wines.
Let's dig a little deeper to see if there is some selection bias happening
Let's look at Joe Czerwinski and Michael Schachner's reviews for Itally and see if they reviewed wine across multiple price buckets
# How many Italian wines each of these two reviewers rated, per price bucket —
# checks whether their apparent biases come from wine selection.
graphDF1 = popReviewsCleanBuckets.loc[(popReviewsCleanBuckets['taster_name'].isin(
['Joe Czerwinski', 'Michael Schachner'])) & (popReviewsCleanBuckets['country'] == 'Italy')]
fig = px.histogram(graphDF1, x = 'price_bucket', color = 'taster_name',
category_orders = {
'price_bucket' : ['Cheap', 'Inexpensive', 'Moderate', 'Pricey', 'Expensive', 'Outlandish']
},
labels = {
'price_bucket' : 'price($)' ,
'taster_name' : 'Name'
},
title = 'Price of Wine Reviewed by Reviewer and Price'
)
fig.show()
Both tasters bias' can be explained by their lop-sided selection of cheap wine in Joe Czerwinski's case, and more expensive wines in Michael Schachner's case
# US
USReviewers = ['Anna Lee C. Iijima', 'Jim Gordon',
'Joe Czerwinski', 'Matt Kettmann', 'Michael Schachner', 'Paul Gregutt', 'Sean P. Sullivan']
graphDF1 = popReviewsCleanBuckets.loc[(popReviewsCleanBuckets['taster_name'].isin(USReviewers))&
(popReviewsCleanBuckets['country'] == 'US')]
graphDF2 = reviewsClean.loc[reviewsClean['country'] == 'US']
fig = px.box(graphDF1, x = 'taster_name', y="points",
labels = {
'points' : 'Rating',
'taster_name': 'Reviewer'
},
color_discrete_sequence=px.colors.qualitative.Dark24,
title = 'US Wine Reviews'
)
fig2 = px.box(graphDF2, x='country', y='points',
labels = {
'country' : 'Total Reviws'
}
)
fig.add_trace(fig2.data[0])
fig.show()
Could be some bias, let's break it down by price bucket
# Per-bucket review counts for each US reviewer — checks whether apparent
# biases are explained by which price tiers each reviewer covers.
graphDF1 = popReviewsCleanBuckets.loc[(popReviewsCleanBuckets['taster_name'].isin(
USReviewers)) & (popReviewsCleanBuckets['country'] == 'US')]
fig = px.histogram(graphDF1, x = 'price_bucket', color = 'taster_name',
category_orders = {
'price_bucket' : ['Cheap', 'Inexpensive', 'Moderate', 'Pricey', 'Expensive', 'Outlandish']
},
labels = {
'price_bucket' : 'price($)',
'taster_name' : 'Name'
},
color_discrete_sequence=px.colors.qualitative.T10,
title = 'Number of US Wines Reviewed by Top Reviewers by Price'
)
fig.show()
On the surface, it looks like all biases can be explained by wine selection
For most popularly reviewed countries a significant proportion of the reviews are performed by one taster, making it impossible to search for significant bias in those reviews
Let's dig a little deeper and do some math to see if there is any subtle bias
Let's start with Anna Lee C. Iijima: among reviewers with a significant number of US wine reviews (>100), she appears to be the furthest away from the mean.
Let's sample our table with the same number of wines from each price bucket that Anna reviewed and perform a two tailed T test and see if her mean differs significantly from the total average.
We are testing the hypothesis that a reviewers mean ratings do not reflect the mean average of the total scores per country.
We will reject the null hypothesis at a p-value of 0.05.
# Obtain Anna's reviews by price bucket for the US
# These counts drive the bucket-matched comparison sample built below.
alciVCs = reviewsCleanBuckets.loc[(reviewsCleanBuckets['taster_name'] == 'Anna Lee C. Iijima') &
(reviewsCleanBuckets['country'] == 'US')].price_bucket.value_counts()
alciVCs
Inexpensive 941 Cheap 831 Moderate 522 Pricey 124 Expensive 7 Outlandish 0 Name: price_bucket, dtype: int64
# create a df populated with samples from the total DF matching Anna's review
# value counts (same number of non-Anna US reviews per price bucket).
# Fix: iterate (bucket, count) pairs with .items() — positional indexing like
# alciVCs[i] on a labeled Series is deprecated and ambiguous.
sampled_parts = []
for bucket, count in alciVCs.items():
    pool = reviewsCleanBuckets.loc[(reviewsCleanBuckets['price_bucket'] == bucket) &
                                   (reviewsCleanBuckets['taster_name'] != 'Anna Lee C. Iijima') &
                                   (reviewsCleanBuckets['country'] == 'US')]
    # sample(0) on the empty 'Outlandish' bucket is a valid no-op
    sampled_parts.append(pool.sample(count))
# Build the frame once instead of concatenating inside the loop.
alciDF = pd.concat(sampled_parts, axis = 0)
alciDF.head()
| Unnamed: 0 | country | description | designation | points | price | province | region_1 | region_2 | taster_name | taster_twitter_handle | title | variety | winery | price_bucket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 67279 | 67279 | US | It's labeled Merlot, but the blend includes Sy... | NaN | 85 | 24.0 | Washington | Columbia Valley (WA) | Columbia Valley | Paul Gregutt | @paulgwine | Mercer 2009 Merlot (Columbia Valley (WA)) | Merlot | Mercer | Inexpensive |
| 101524 | 101524 | US | Vibrant, coppery salmon in color, this lively ... | NaN | 90 | 25.0 | Washington | Yakima Valley | Columbia Valley | Paul Gregutt | @paulgwine | Doyenne 2011 Rosé (Yakima Valley) | Rosé | Doyenne | Inexpensive |
| 25520 | 25520 | US | Fermented in stainless steel, this medium-bodi... | NaN | 86 | 24.0 | California | Alexander Valley | Sonoma | Virginie Boone | @vboone | Wattle Creek 2012 Viognier (Alexander Valley) | Viognier | Wattle Creek | Inexpensive |
| 128919 | 128919 | US | Named for Twisted Oak's infamous rubber chicke... | Ruben's Blend | 88 | 24.0 | California | Calaveras County | Sierra Foothills | Virginie Boone | @vboone | Twisted Oak 2010 Ruben's Blend White (Calavera... | Rhône-style White Blend | Twisted Oak | Inexpensive |
| 121632 | 121632 | US | Opening with aromas of baked blackberry compot... | Lubenko Vineyard | 88 | 22.0 | California | Fiddletown | Sierra Foothills | Virginie Boone | @vboone | Sobon Estate 2010 Lubenko Vineyard Zinfandel (... | Zinfandel | Sobon Estate | Inexpensive |
Let's do that T-Test
# Anna's actual US reviews — the first sample for the t-test.
a = reviewsCleanBuckets.loc[(reviewsCleanBuckets['taster_name'] == 'Anna Lee C. Iijima') &
(reviewsCleanBuckets['country'] == 'US')]
# Two-sample t-test of Anna's scores vs the bucket-matched sample.
# ttest_ind returns (statistic, pvalue); [1] extracts the p-value.
scipy.stats.ttest_ind(a.points, alciDF.points)[1]
3.1619710265470665e-15
That's a very low p-value — far below our 0.05 threshold — so by our stated criterion we must reject the null hypothesis: Anna's mean score differs significantly from the bucket-matched sample. Statistically this points to a real (if possibly small) difference, assuming our sampling method is reasonable.
That was easy, let's weaponize that logic. Let's use this method to find any potential bias for the top 10 reviewers in the top 10 countries for reviewers with over 100 ratings in that country.
# Run the bucket-matched t-test for every (top reviewer, top-10 country) pair
# where the reviewer has more than 100 reviews in that country.
# Fix: 'Kerin O’Keefe' and 'Virginie Boone' were fused into one string.
topReviwersList = ['Roger Voss', 'Michael Schachner', 'Kerin O’Keefe', 'Virginie Boone',
                   'Paul Gregutt', 'Matt Kettmann', 'Joe Czerwinski', 'Sean P. Sullivan',
                   'Anna Lee C. Iijima', 'Jim Gordon']
reviewCountries = reviewsCleanBuckets.country.value_counts().index[:10]
# Accumulate result rows as dicts and build the frame once at the end —
# DataFrame.append is deprecated in favor of pd.concat / constructor.
resultRows = []
for name in topReviwersList:
    for country in reviewCountries:
        # this reviewer's reviews within the current country
        reviewerCountryDF = reviewsCleanBuckets.loc[(reviewsCleanBuckets['country'] == country) &
                                                    (reviewsCleanBuckets['taster_name'] == name)]
        # check if reviewer has a significant number of reviews in the current country
        if len(reviewerCountryDF) > 100:
            # obtain the reviewer's review count by price bucket for this country
            VCs = reviewerCountryDF.price_bucket.value_counts()
            # Fix: rebuild the comparison sample for EVERY reviewer/country pair.
            # The original initialized sampDF once before the loops and kept
            # concatenating into it, so every t-test after the first compared
            # against a pool polluted by earlier reviewers and countries.
            sampledParts = []
            for bucket, count in VCs.items():
                # NOTE(review): unlike the Anna example above, the pool is not
                # restricted to other reviewers — some reviewers cover ~100% of
                # a country, so excluding their own reviews would leave too few
                # rows to sample. The sample may therefore overlap their own.
                pool = reviewsCleanBuckets.loc[(reviewsCleanBuckets['price_bucket'] == bucket) &
                                               (reviewsCleanBuckets['country'] == country)]
                sampledParts.append(pool.sample(count))
            sampDF = pd.concat(sampledParts, axis = 0)
            # perform the two-sample t-test; [1] is the p-value
            p = scipy.stats.ttest_ind(reviewerCountryDF.points, sampDF.points)[1]
            numReviews = len(reviewerCountryDF)
            perReviews = numReviews / len(reviewsCleanBuckets.loc[reviewsCleanBuckets['country'] == country])
            resultRows.append({'reviewer': name, 'country': country, 'p-value': p,
                               'Number of Reviews': numReviews,
                               'Percentage of Total': round(perReviews * 100, 2)})
tTestResultsDF = pd.DataFrame(resultRows,
                              columns = ['reviewer', 'country', 'p-value',
                                         'Number of Reviews', 'Percentage of Total'])
tTestResultsDF
| reviewer | country | p-value | Number of Reviews | Percentage of Total | |
|---|---|---|---|---|---|
| 0 | Roger Voss | France | 1.544395e-04 | 14395 | 82.14 |
| 1 | Roger Voss | Portugal | 2.271643e-13 | 4842 | 99.43 |
| 2 | Roger Voss | Austria | 2.038144e-03 | 831 | 29.77 |
| 3 | Michael Schachner | Spain | 1.606672e-139 | 6503 | 99.91 |
| 4 | Michael Schachner | Chile | 3.861679e-225 | 4281 | 99.44 |
| 5 | Michael Schachner | Argentina | 4.693461e-121 | 3753 | 100.00 |
| 6 | Paul Gregutt | US | 7.426270e-149 | 9271 | 24.72 |
| 7 | Matt Kettmann | US | 0.000000e+00 | 6236 | 16.62 |
| 8 | Joe Czerwinski | US | 1.344610e-22 | 103 | 0.27 |
| 9 | Joe Czerwinski | France | 5.367604e-05 | 1129 | 6.44 |
| 10 | Joe Czerwinski | Germany | 3.672109e-08 | 363 | 17.34 |
| 11 | Joe Czerwinski | Australia | 7.648167e-11 | 2006 | 99.95 |
| 12 | Sean P. Sullivan | US | 2.501165e-15 | 4893 | 13.04 |
| 13 | Anna Lee C. Iijima | US | 2.475883e-49 | 2425 | 6.46 |
| 14 | Anna Lee C. Iijima | Germany | 1.415996e-98 | 1730 | 82.66 |
| 15 | Jim Gordon | US | 1.618293e-06 | 4171 | 11.12 |
Conclusion: every p-value in the table falls below 0.05, so by our stated criterion each reviewer's mean differs significantly from the bucket-matched sample for their countries. With samples this large, though, even tiny mean differences become statistically significant; comparing the actual mean differences (effect sizes) would tell us whether any of these biases are practically meaningful.
I will take the 7 most common pairing suggestions and try to find a region for the top 5 price buckets that have the highest mean rating per dollar spent
Start by looking at the varieties of wine with more than 200 entries (need a reasonably sized list, and large sample size)
# Varieties with a large enough sample (> 200 reviews) to analyse.
varietyCounts = reviewsCleanBuckets['variety'].value_counts()
varList = list(varietyCounts[varietyCounts > 200].index)
# Exclude categories that are too broad, dessert wines (only doing pairings
# with mains), and Pinot Grigio, because no one should have to drink that.
removeList = {'Red Blend', 'Portuguese Red', 'Port', 'Melon', 'Sauvignon', 'Pinot Grigio'}
varList = [variety for variety in varList if variety not in removeList]
# Keep only reviews of the retained varieties; renumber the rows and discard
# the stale index columns.
topVarDF = reviewsCleanBuckets.loc[reviewsCleanBuckets['variety'].isin(varList)].reset_index()
topVarDF.drop(['index', 'Unnamed: 0'], axis = 1, inplace= True)
I will use my own knowledge of the subject of wine pairing to create a dictionary of pairing classification and appropriate wines to serve.
# create categories for pairing
# Food category -> wine varieties considered an appropriate pairing.
# NOTE(review): 'Verdejo' (a white) is listed under red_meat — confirm intent.
pairingsDict = {'fish' : ['Sauvignon Blanc', 'Pinot Gris', 'White Blend', 'Chenin Blanc', 'Albariño', 'Pinot Blanc'],
'red_meat' : ['Cabernet Sauvignon', 'Bordeaux-style Red Blend', 'Nebbiolo','Rhône-style Red Blend', 'Cabernet Franc', 'Barbera','Verdejo', 'Petit Verdot'],
'salty' : ['Rosé', 'Sparkling Blend', 'Champagne Blend', 'Glera'],
'spicey' : ['Syrah', 'Malbec', 'Tempranillo', 'Gamay','Shiraz', 'Tempranillo Blend', 'Grenache' ,'Petite Sirah', 'Garnacha'],
'rich' : ['Pinot Noir', 'Merlot', 'Sangiovese', 'Zinfandel', 'Carmenère','Torrontés' ],
'pork' : ['Riesling','Grüner Veltliner', 'Gewürztraminer','Blaufränkisch'],
'chicken' : ['Chardonnay', 'Portuguese White', 'Viognier', 'Bordeaux-style White Blend', 'Rhône-style White Blend']
}
# Attach a 'pairing' category to each review via a variety -> category lookup.
# Fix: the original nested loop had no 'break', so a variety appearing under
# two categories would append twice, shifting every subsequent value and
# silently misaligning the whole concatenated column. A direct .map() cannot
# misalign, and makes the fragile whole-frame column rename unnecessary.
varietyToPairing = {}
for category, varieties in pairingsDict.items():
    for v in varieties:
        # first category listed wins, matching the original first-match order
        varietyToPairing.setdefault(v, category)
topVarDF['pairing'] = topVarDF['variety'].map(varietyToPairing)
# Fill region_1 NaNs with a sentinel so unlabelled regions survive grouping.
# Fix: the original called fillna without assigning the result, so the NaNs
# were never actually filled (contradicting its own comment).
topVarDF['region_1'] = topVarDF['region_1'].fillna('None')
# keep only regions with more than 5 reviews
topVarDF = topVarDF[topVarDF['region_1'].map(topVarDF['region_1'].value_counts()) > 5]
# sort by review score
topVarDF = topVarDF.sort_values('points')
# Aggregate price/points statistics per (pairing, bucket, country, region)
# using named aggregation; drop groups with any missing statistic.
aggSpec = dict(
    variety=('variety', 'first'),
    points_average=('points', 'mean'),
    points_std=('points', 'std'),
    price_avg=('price', 'mean'),
    price_std=('price', 'std'),
    count=('points', 'count'),
)
bigGroup = (topVarDF
            .groupby(['pairing', 'price_bucket', 'country', 'region_1'])
            .agg(**aggSpec)
            .dropna())
bigGroup.head()
| variety | points_average | points_std | price_avg | price_std | count | ||||
|---|---|---|---|---|---|---|---|---|---|
| pairing | price_bucket | country | region_1 | ||||||
| chicken | Cheap | Argentina | Alto Valle del Río Negro | Chardonnay | 85.857143 | 0.899735 | 17.000000 | 0.000000 | 7 |
| Argentina | Chardonnay | 84.000000 | 0.000000 | 11.000000 | 2.828427 | 2 | |||
| Calchaquí Valley | Chardonnay | 83.400000 | 1.140175 | 14.000000 | 2.738613 | 5 | |||
| Luján de Cuyo | Chardonnay | 85.000000 | 1.732051 | 10.666667 | 0.577350 | 3 | |||
| Maipú | Chardonnay | 83.500000 | 2.121320 | 14.000000 | 0.000000 | 2 |
# Value metric: rating points earned per dollar spent.
bigGroup['value'] = bigGroup['points_average'].div(bigGroup['price_avg'])
Take it for a spin. Let's look at the top 5 value pairings for Fish and Inexpensive wines
bigGroup.loc[('fish', 'Inexpensive')].sort_values('value').tail(5)
| variety | points_average | points_std | price_avg | price_std | count | value | ||
|---|---|---|---|---|---|---|---|---|
| country | region_1 | |||||||
| France | Limoux | Chenin Blanc | 88.250 | 2.217356 | 18.5 | 1.000000 | 4 | 4.770270 |
| US | Shenandoah Valley (CA) | Sauvignon Blanc | 86.000 | 0.000000 | 18.0 | 0.000000 | 2 | 4.777778 |
| Snipes Mountain | Chenin Blanc | 88.500 | 4.949747 | 18.5 | 0.707107 | 2 | 4.783784 | |
| Potter Valley | Sauvignon Blanc | 87.125 | 0.834523 | 18.0 | 0.000000 | 8 | 4.840278 | |
| France | Rivesaltes | White Blend | 89.000 | 0.000000 | 18.0 | 0.000000 | 2 | 4.944444 |
Let's make a value dataframe with all permutations of pairings and price bucket
Note: Had to remove Outlandish price bucket, there are not enough reviews for certain pairings to create meaningful results
# Best-value table: for every pairing x price-bucket combination, keep the
# single row with the highest value score.
# ('Outlandish' is excluded — too few reviews for some pairings.)
pairingsList = ['fish', 'red_meat', 'salty', 'spicey', 'rich', 'pork', 'chicken']
bucketList = ['Cheap', 'Inexpensive', 'Moderate', 'Pricey', 'Expensive']
bigGroupReset = bigGroup.reset_index()
# PERF FIX: collect each winner in a list and concat once at the end —
# calling pd.concat inside the loop re-copies the accumulated frame every
# iteration (quadratic in the number of pieces).
winners = []
for pairing in pairingsList:
    for bucket in bucketList:
        mask = ((bigGroupReset['pairing'] == pairing)
                & (bigGroupReset['price_bucket'] == bucket))
        winners.append(bigGroupReset.loc[mask].sort_values('value').tail(1))
valueDF = pd.concat(winners, axis=0).sort_values(['pairing', 'price_bucket'])
valueDF
| pairing | price_bucket | country | region_1 | variety | points_average | points_std | price_avg | price_std | count | value | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 72 | chicken | Cheap | Spain | Campo de Borja | Chardonnay | 83.500000 | 0.707107 | 8.500000 | 0.707107 | 2 | 9.823529 |
| 190 | chicken | Inexpensive | Italy | Alto Adige | Chardonnay | 87.333333 | 1.154701 | 18.000000 | 0.000000 | 3 | 4.851852 |
| 352 | chicken | Moderate | Spain | Cava | Chardonnay | 90.500000 | 0.707107 | 26.500000 | 0.707107 | 2 | 3.415094 |
| 440 | chicken | Pricey | Australia | Eden Valley | Viognier | 92.500000 | 0.707107 | 45.000000 | 0.000000 | 2 | 2.055556 |
| 560 | chicken | Expensive | US | Fort Ross-Seaview | Chardonnay | 95.666667 | 2.081666 | 80.000000 | 0.000000 | 3 | 1.195833 |
| 635 | fish | Cheap | Spain | Catalunya | White Blend | 84.500000 | 1.732051 | 9.500000 | 2.516611 | 4 | 8.894737 |
| 736 | fish | Inexpensive | France | Rivesaltes | White Blend | 89.000000 | 0.000000 | 18.000000 | 0.000000 | 2 | 4.944444 |
| 941 | fish | Moderate | US | Sonoma Mountain | Sauvignon Blanc | 90.500000 | 0.707107 | 26.000000 | 0.000000 | 2 | 3.480769 |
| 960 | fish | Pricey | France | Savennières | Chenin Blanc | 92.500000 | 0.707107 | 45.000000 | 0.000000 | 2 | 2.055556 |
| 988 | fish | Expensive | France | Quarts de Chaume | Chenin Blanc | 93.500000 | 2.121320 | 82.000000 | 2.828427 | 2 | 1.140244 |
| 1020 | pork | Cheap | US | Idaho | Riesling | 86.666667 | 2.338090 | 9.000000 | 3.033150 | 6 | 9.629630 |
| 1069 | pork | Inexpensive | US | Columbia Valley (OR) | Riesling | 88.500000 | 0.707107 | 18.000000 | 0.000000 | 2 | 4.916667 |
| 1115 | pork | Moderate | Canada | Beamsville Bench | Riesling | 90.666667 | 1.154701 | 26.666667 | 1.154701 | 3 | 3.400000 |
| 1151 | pork | Pricey | Australia | Eden Valley | Riesling | 92.500000 | 0.707107 | 44.000000 | 1.414214 | 2 | 2.102273 |
| 1159 | pork | Expensive | Canada | Niagara Peninsula | Riesling | 91.750000 | 1.500000 | 82.500000 | 5.000000 | 4 | 1.112121 |
| 1222 | red_meat | Cheap | France | Languedoc | Cabernet Sauvignon | 81.500000 | 0.707107 | 8.500000 | 0.707107 | 2 | 9.588235 |
| 1388 | red_meat | Inexpensive | Italy | Trentino | Cabernet Sauvignon | 88.000000 | 0.000000 | 18.000000 | 0.000000 | 2 | 4.888889 |
| 1534 | red_meat | Moderate | France | Ventoux | Rhône-style Red Blend | 92.333333 | 0.577350 | 28.666667 | 2.309401 | 3 | 3.220930 |
| 1677 | red_meat | Pricey | France | Lirac | Rhône-style Red Blend | 92.000000 | 0.816497 | 43.500000 | 1.000000 | 4 | 2.114943 |
| 1798 | red_meat | Expensive | Italy | Barbera d'Asti | Barbera | 89.000000 | 1.414214 | 82.500000 | 3.535534 | 2 | 1.078788 |
| 1930 | rich | Cheap | US | Idaho | Merlot | 87.500000 | 0.707107 | 8.500000 | 2.121320 | 2 | 10.294118 |
| 2080 | rich | Inexpensive | US | Santa Margarita Ranch | Zinfandel | 88.750000 | 2.061553 | 18.000000 | 0.000000 | 4 | 4.930556 |
| 2199 | rich | Moderate | US | Hudson River Region | Pinot Noir | 88.000000 | 0.000000 | 27.500000 | 2.121320 | 2 | 3.200000 |
| 2354 | rich | Pricey | US | Monterey County | Pinot Noir | 92.166667 | 1.169045 | 44.333333 | 0.816497 | 6 | 2.078947 |
| 2428 | rich | Expensive | US | California | Pinot Noir | 91.333333 | 1.211060 | 81.500000 | 2.738613 | 6 | 1.120654 |
| 2540 | salty | Cheap | Spain | La Mancha | Rosé | 84.000000 | 1.414214 | 7.000000 | 0.000000 | 2 | 12.000000 |
| 2627 | salty | Inexpensive | France | Touraine | Sparkling Blend | 88.333333 | 0.577350 | 18.000000 | 0.000000 | 3 | 4.907407 |
| 2747 | salty | Moderate | US | Mendocino | Rosé | 91.000000 | 1.414214 | 26.000000 | 0.000000 | 2 | 3.500000 |
| 2793 | salty | Pricey | US | North Coast | Sparkling Blend | 91.800000 | 0.836660 | 43.800000 | 0.836660 | 5 | 2.095890 |
| 2807 | salty | Expensive | US | Carneros | Sparkling Blend | 93.500000 | 0.707107 | 80.000000 | 0.000000 | 2 | 1.168750 |
| 2822 | spicey | Cheap | Argentina | Tulum Valley | Malbec | 85.000000 | 2.449490 | 9.000000 | 0.000000 | 5 | 9.444444 |
| 2946 | spicey | Inexpensive | Australia | Western Australia | Shiraz | 89.000000 | 1.224745 | 18.600000 | 0.894427 | 5 | 4.784946 |
| 3088 | spicey | Moderate | France | Rivesaltes | Grenache | 89.000000 | 0.000000 | 26.000000 | 0.000000 | 2 | 3.423077 |
| 3284 | spicey | Pricey | US | Sonoma Mountain | Grenache | 91.000000 | 0.000000 | 44.000000 | 0.000000 | 2 | 2.068182 |
| 3321 | spicey | Expensive | France | Moulin-à-Vent | Gamay | 93.000000 | 0.000000 | 80.000000 | 0.000000 | 2 | 1.162500 |
Let's try to visualize this data
# Combine country and region into one y-axis label.
valueDF['location'] = valueDF['country'] + ', ' + valueDF['region_1']
fig = px.scatter(
    valueDF,
    y='location',
    x='variety',
    color='pairing',
    facet_col='price_bucket',
    size='value',
    labels={
        'location': 'Region',
        'pairing': '',
        'price_bucket': 'Price',
        'variety': 'Varietal',
    },
    title='Best Wine Value for Pairings (rating per dollar) by Location <br><sup>size of marker is relative value</sup>',
    color_discrete_sequence=px.colors.qualitative.T10,
    height=900,
    width=1000,
)
# Independent x-axes per facet, with a visible grey grid on both axes.
fig.update_xaxes(matches=None, showgrid=True, gridwidth=1, gridcolor='gray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='gray')
# Centre the title and put a horizontal legend above the plot.
fig.update_layout(title_x=0.5, title_y=.96,
                  legend=dict(orientation="h",
                              yanchor="bottom",
                              y=1.02,
                              xanchor="right",
                              x=1,
                              font=dict(size=16)))
# Dark outline around the markers for contrast.
fig.update_traces(marker=dict(line=dict(width=2, color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
# Strip the 'price_bucket=' prefix from the facet annotations.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig
Let's try to visualize how often these countries show up in our value dataframe
# How often does each country appear in the value table, split by price bucket?
fig = px.bar(
    valueDF,
    x='country',
    color='price_bucket',
    labels={
        'price_bucket': 'Price',
        'country': 'Country',
    },
    color_discrete_sequence=px.colors.qualitative.T10,
    title='Country Frequency in Value Table by Price',
)
fig
Conclusion: This table gives us a good idea about what countries to consider purchasing a bottle from if we are searching for high value wine
We just discovered the United States is the country with the highest number of entries in our value dataframe. Let's try to find which specific regions and varietals overperform.
Filter our dataframe for wines from the United States
# Filter the bucketed reviews to US wines only; drop the leftover CSV index column.
# BUG FIX: the boolean mask was built from `reviewsClean` while indexing
# `reviewsCleanBuckets` — mask and selection must come from the same frame,
# otherwise index misalignment can silently select the wrong rows (or raise).
USWines = reviewsCleanBuckets.loc[reviewsCleanBuckets['country'] == 'US'].drop('Unnamed: 0', axis = 1)
USWines.head()
| country | description | designation | points | price | province | region_1 | region_2 | taster_name | taster_twitter_handle | title | variety | winery | price_bucket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | US | Tart and snappy, the flavors of lime flesh and... | NaN | 87 | 14.0 | Oregon | Willamette Valley | Willamette Valley | Paul Gregutt | @paulgwine | Rainstorm 2013 Pinot Gris (Willamette Valley) | Pinot Gris | Rainstorm | Cheap |
| 3 | US | Pineapple rind, lemon pith and orange blossom ... | Reserve Late Harvest | 87 | 13.0 | Michigan | Lake Michigan Shore | NaN | Alexander Peartree | NaN | St. Julian 2013 Reserve Late Harvest Riesling ... | Riesling | St. Julian | Cheap |
| 4 | US | Much like the regular bottling from 2012, this... | Vintner's Reserve Wild Child Block | 87 | 65.0 | Oregon | Willamette Valley | Willamette Valley | Paul Gregutt | @paulgwine | Sweet Cheeks 2012 Vintner's Reserve Wild Child... | Pinot Noir | Sweet Cheeks | Pricey |
| 10 | US | Soft, supple plum envelopes an oaky structure ... | Mountain Cuvée | 87 | 19.0 | California | Napa Valley | Napa | Virginie Boone | @vboone | Kirkland Signature 2011 Mountain Cuvée Caberne... | Cabernet Sauvignon | Kirkland Signature | Inexpensive |
| 12 | US | Slightly reduced, this wine offers a chalky, t... | NaN | 87 | 34.0 | California | Alexander Valley | Sonoma | Virginie Boone | @vboone | Louis M. Martini 2012 Cabernet Sauvignon (Alex... | Cabernet Sauvignon | Louis M. Martini | Moderate |
Let's further filter the list to varietals with over 200 reviews for better visualization and statistical treatment
# Keep only varietals with more than 200 US reviews (enough data per box plot).
# FIX: take an explicit .copy() so the fillna below mutates our own frame —
# the inplace fillna on a filtered view was raising SettingWithCopyWarning
# (captured in the output below) and may not have modified anything.
topUSVarsDF = USWines[USWines['variety'].map(USWines['variety'].value_counts()) > 200].copy()
topUSVarsDF['price_bucket'] = topUSVarsDF['price_bucket'].fillna('Cheap')
topUSVarsList = list(topUSVarsDF.variety)
C:\Users\jmeis\anaconda3\lib\site-packages\pandas\core\generic.py:6392: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Let's see how US varietals perform against the rest of the world in each price bucket
# Compare the most-reviewed US varietals against the same varietals world-wide,
# one grouped box plot per price bucket. The five nearly identical cell blocks
# are folded into a single helper.
# NOTE(review): the "World Wines" trace filters only by varietal and bucket,
# so it still includes the US reviews — confirm that is intended.
def _usVsWorldBoxFig(bucket, title, tickangle=None):
    """Build a US-vs-world box plot of review scores for one price bucket.

    bucket    -- price_bucket value to filter on
    title     -- full figure title (kept verbatim from the originals)
    tickangle -- optional x-axis tick rotation (the crowded buckets use 90)
    """
    usDF = topUSVarsDF.loc[topUSVarsDF['price_bucket'] == bucket]
    worldDF = reviewsCleanBuckets.loc[(reviewsCleanBuckets['variety'].isin(topUSVarsList))
                                      & (reviewsCleanBuckets['price_bucket'] == bucket)]
    xaxis = {'title': 'Varietal'}
    if tickangle is not None:
        xaxis['tickangle'] = tickangle
    data = [go.Box(x=usDF['variety'], y=usDF['points'], name='US Wines'),
            go.Box(x=worldDF['variety'], y=worldDF['points'], name='World Wines')]
    layout = go.Layout(title=title, xaxis=xaxis, yaxis={'title': 'Score'})
    return go.Figure(data=data, layout=layout)

# FIX: removed a dead assignment to DFGraph2 that was immediately overwritten
# by the bucket-filtered version.
# NOTE(review): the 'Pricey' bucket is skipped — fig4 jumps from 'Moderate'
# to 'Expensive'; confirm that is intentional.
fig1 = _usVsWorldBoxFig('Cheap', 'Most Reviewed US wine varietals vs Review Scores for "Cheap Wines"')
fig2 = _usVsWorldBoxFig('Inexpensive', 'Most Reviewed US wine varietals vs Review Scores for "Inexpensive" Wines')
fig3 = _usVsWorldBoxFig('Moderate', 'Most Reviewed US wine varietals vs Review Scores for "Moderate" Wines')
fig4 = _usVsWorldBoxFig('Expensive', 'Most Reviewed US wine varietals vs Review Scores for "Expensive" Wines', tickangle=90)
fig5 = _usVsWorldBoxFig('Outlandish', 'Most Reviewed US wine varietals vs Review Scores for "Outlandish" Wines', tickangle=90)
There is a lot of data here, I decided to break it down into 5 graphs by price bucket.
# Render the five per-bucket box plots built above (one notebook cell each).
fig1
fig2
fig3
fig4
fig5
It seems like it would be very difficult to offset one plot from another; let's try a numerical approach to find US wine varietals that score better on average than the rest of the world by price bucket
# Tag every review of a top-US varietal as 'US' or 'world', then average
# points per (price bucket, flag, varietal). A helper column makes the
# groupby straightforward.
worldVars = (reviewsCleanBuckets
             .loc[reviewsCleanBuckets['variety'].isin(topUSVarsList)]
             .drop('Unnamed: 0', axis=1)
             .reset_index(drop=True))
# Direct column assignment replaces the concat-then-rename dance.
worldVars['is_US'] = np.where(worldVars['country'] == 'US', 'US', 'world')
worldVarsGroup = worldVars.groupby(['price_bucket', 'is_US', 'variety']).agg(
    points_mean=('points', 'mean'),
).fillna(0)
worldVarsGroup
| points_mean | |||
|---|---|---|---|
| price_bucket | is_US | variety | |
| Cheap | US | Barbera | 86.263158 |
| Bordeaux-style Red Blend | 86.666667 | ||
| Cabernet Franc | 86.785714 | ||
| Cabernet Sauvignon | 86.286638 | ||
| Chardonnay | 86.369447 | ||
| ... | ... | ... | ... |
| Outlandish | world | Syrah | 91.888889 |
| Tempranillo | 94.083333 | ||
| Viognier | 0.000000 | ||
| White Blend | 0.000000 | ||
| Zinfandel | 0.000000 |
300 rows × 1 columns
Need to split the groupby into two dataframes, US and world, then join on price bucket and variety
# Split the grouped frame into US and world halves, then merge them side by
# side on (price_bucket, variety) so the two means can be compared per row.
worldVarsGroup.reset_index(inplace=True)
# FIX: take explicit copies before renaming columns — the originals assigned
# .columns on .loc slices, which risks the view-vs-copy pitfall that raised
# SettingWithCopyWarning elsewhere in this notebook.
US = worldVarsGroup.loc[worldVarsGroup['is_US'] == 'US'].copy()
world = worldVarsGroup.loc[worldVarsGroup['is_US'] != 'US'].copy()
# Rename so the merged frame has distinct per-side column names.
US.columns = ['price_bucket', 'is_US', 'US_variety', 'US_points_mean']
world.columns = ['price_bucket', 'is_US', 'world_variety', 'world_points_mean']
# Left merge keeps every US row even when the world has no matching variety.
pointsCompare = pd.merge(US, world, how='left',
                         left_on=['price_bucket', 'US_variety'],
                         right_on=['price_bucket', 'world_variety'])
pointsCompare = (pointsCompare
                 .drop(['is_US_y', 'world_variety'], axis=1)
                 .rename(columns={'is_US_x': 'is_US', 'US_variety': 'variety'}))
# Positive difference = the US out-scores the rest of the world.
pointsCompare['difference'] = pointsCompare['US_points_mean'] - pointsCompare['world_points_mean']
pointsCompare
| price_bucket | is_US | variety | US_points_mean | world_points_mean | difference | |
|---|---|---|---|---|---|---|
| 0 | Cheap | US | Barbera | 86.263158 | 87.265306 | -1.002148 |
| 1 | Cheap | US | Bordeaux-style Red Blend | 86.666667 | 86.394584 | 0.272083 |
| 2 | Cheap | US | Cabernet Franc | 86.785714 | 86.615385 | 0.170330 |
| 3 | Cheap | US | Cabernet Sauvignon | 86.286638 | 85.554695 | 0.731943 |
| 4 | Cheap | US | Chardonnay | 86.369447 | 85.188552 | 1.180895 |
| ... | ... | ... | ... | ... | ... | ... |
| 145 | Outlandish | US | Syrah | 94.333333 | 91.888889 | 2.444444 |
| 146 | Outlandish | US | Tempranillo | 0.000000 | 94.083333 | -94.083333 |
| 147 | Outlandish | US | Viognier | 0.000000 | 0.000000 | 0.000000 |
| 148 | Outlandish | US | White Blend | 0.000000 | 0.000000 | 0.000000 |
| 149 | Outlandish | US | Zinfandel | 0.000000 | 0.000000 | 0.000000 |
150 rows × 6 columns
Let's only take rows with a difference greater than zero (varieties where the US performs better)
# Keep only varietals where the US out-scores the world. The upper bound
# (< 10) drops artifact rows where one side had no reviews — its mean was
# filled with 0, producing spurious huge differences (see the ±90-point rows
# in the table above).
pointsComparePos = pointsCompare.loc[(pointsCompare['difference'] > 0) & (pointsCompare['difference'] < 10)]
fig = px.bar(pointsComparePos, x = 'variety', y = 'difference', facet_row = 'price_bucket', height = 1000,
             labels = {
                 'price_bucket' : 'Price' ,
                 'difference' : 'Score Differential',
                 'variety': 'Varietal'
             },
             color_discrete_sequence=px.colors.qualitative.Dark24,
             # TYPO FIX: 'Eorld' -> 'World' (matches the combined chart's title).
             title = 'Varietals Where US Scores Higher Than the Rest of the World'
             )
fig.update_yaxes(matches=None)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
fig
This is a little difficult to read, let's combine with some color!
# Same comparison collapsed into one chart, with price bucket as the color.
fig = px.bar(
    pointsComparePos,
    x='variety',
    y='difference',
    color='price_bucket',
    labels={
        'price_bucket': 'Price',
        'variety': 'Varietal',
        'difference': 'Average Review Score Differential',
    },
    color_discrete_sequence=px.colors.qualitative.T10,
    title='Varietals Where US Scores Higher Than the Rest of the World',
)
fig
The US seems to perform the most consistently on the 'Cheap' wines. US 'Expensive' wines, when they do perform better than the world average, are significantly better.
Let's find the regions where the US is making this great wine
# For each price bucket, take the varietal with the biggest positive
# score differential.
bestUS = pointsComparePos.sort_values('difference', ascending = False).groupby('price_bucket').first()
bestUS
# FIX: derive the winning (variety, price_bucket) pairs directly from bestUS
# instead of hand-transcribing them into literal lists — the old
# tempList/bucketList were copied by eye from the table above and could
# silently drift from the data if upstream cells change.
bestPairs = list(zip(bestUS['variety'], bestUS.index))
# Filter the US frame to rows matching any winning (variety, bucket) pair.
bestUSComplete = topUSVarsDF.loc[
    topUSVarsDF[['variety', 'price_bucket']].apply(tuple, axis=1).isin(bestPairs)]
# Count qualifying wines per (price bucket, state, varietal); drop empty groups.
smallGroup = bestUSComplete.groupby(['price_bucket', 'province', 'variety']).agg(
    province_count=('province', 'count')).reset_index()
smallGroup = smallGroup.loc[smallGroup['province_count'] != 0]
fig = px.bar(smallGroup, x = 'province', y = 'province_count', color = 'variety', facet_row = 'price_bucket', height = 1000,
             labels = {
                 'province_count' : '# of Wines',
                 'province' : 'State',
                 # TYPO FIX: 'Varital' -> 'Varietal' (consistent with every other chart).
                 'variety':'Varietal'
             },
             color_discrete_sequence=px.colors.qualitative.T10,
             title = 'Number of Reviewed Wines in Each State and Price Bucket That Perform Better Than the World Average' )
fig.update_yaxes(matches=None)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
fig
Conclusion: If you are looking for wine that scores higher than the world average, buy California!
We've spent a lot of time looking at countries known for their wine production, let's look at some lesser known areas
# Countries with more than 100 but fewer than 1000 reviews — the "small
# producers". Rewritten in the same map-counts-then-compare style as the
# earlier region filter, for consistency and readability (the old one-liner
# mapped a pre-computed boolean series, which behaves identically but is
# much harder to read).
countryCounts = reviewsCleanBuckets['country'].value_counts()
reviewsPerCountry = reviewsCleanBuckets['country'].map(countryCounts)
smallCountriesDF = reviewsCleanBuckets.loc[(reviewsPerCountry > 100) & (reviewsPerCountry < 1000)]
smallCountriesDF.country.value_counts()
Israel 484 Greece 461 Canada 253 Hungary 144 Bulgaria 141 Romania 120 Uruguay 109 Name: country, dtype: int64
Filtering our dataframe to countries with more than 100 reviews and less than 1000 gives us a nice little sample of 7 countries to look for quality wine
Let's see what kind of wine is being made in these countries
# Which varietals do these small-producer countries make?
fig = px.histogram(
    smallCountriesDF,
    x='variety',
    color='country',
    title='Varietals of wine made by small production countries',
)
fig
As expected, a lot of varietals with only a couple of reviews, let's filter again for varietals with more than 10 reviews.
# Drop rare varietals: keep only those with more than 10 reviews, then re-plot.
varietyCounts = smallCountriesDF['variety'].value_counts()
popSmallCountriesDF = smallCountriesDF.loc[smallCountriesDF['variety'].map(varietyCounts > 10)]
fig = px.histogram(
    popSmallCountriesDF,
    x='variety',
    color='country',
    title='Varietals of wine made by small production countries',
)
fig
We can work with this
Let's find varieties of wine that are made both in these countries and in the rest of the world in statistically significant quantities, and compare ratings.
# Array of the varieties produced in these small countries.
smallVariety = popSmallCountriesDF.variety.unique()
# World-wide reviews of those varieties, limited to varieties with more
# than 100 reviews overall.
# NOTE(review): fullSmallVarDF is not referenced again in this part of the
# analysis — confirm it is used further on.
producedBySmall = reviewsCleanBuckets['variety'].isin(smallVariety)
reviewedEnough = reviewsCleanBuckets['variety'].map(reviewsCleanBuckets['variety'].value_counts() > 100)
fullSmallVarDF = reviewsCleanBuckets.loc[producedBySmall & reviewedEnough]
In our early analysis we discovered that, out of the 5 most reviewed countries, Italy appeared to score the lowest; let's look further into that.
# Build a copy of the bucketed reviews tagged Italy vs world.
italyDF = reviewsCleanBuckets.copy()
italyDF.price_bucket.fillna('Cheap', inplace = True)
# Drop rows with no country, then reset to a clean RangeIndex once.
# FIX: the original reset the index twice (before and after the filter);
# the first reset was redundant — only the post-filter reset matters for
# row alignment of the flag column below.
italyDF = italyDF[italyDF['country'].notna()]
italyDF.reset_index(drop = True, inplace = True)
# Direct assignment replaces the concat-then-rename pattern; result is the
# same 'is_italy' column appended at the end.
italyDF['is_italy'] = np.where(italyDF['country'] == 'Italy', 'Italy', 'world')
# Box plots of review scores, Italy vs world, per price bucket.
fig = px.box(
    italyDF,
    x='price_bucket',
    y='points',
    color='is_italy',
    category_orders={
        'price_bucket': ['Cheap', 'Inexpensive', 'Moderate', 'Pricey', 'Expensive', 'Outlandish'],
    },
    labels={
        'price_bucket': 'price($)',
        'points': 'rating',
    },
    title='Comparison of Review scores of Italy versus the rest of the world',
)
fig.update_layout(legend_title='Country')
fig.show()
Alright, it's not just one price bucket...are there certain regions or varietals pulling the review scores down?
# Restrict to Italian wines only.
onlyItalyDF = italyDF.loc[italyDF['country'] == 'Italy']
# Mean points per (price_bucket, province).
dfh = onlyItalyDF.groupby(['price_bucket', 'province']).agg(
    average_points=('points', 'mean'),
).dropna()
# Within each price bucket, keep the three worst-scoring provinces.
h = dfh.groupby(level=0, group_keys=False).apply(
    lambda grp: grp.sort_values('average_points').head(3)).reset_index()
# Attach the overall mean points (all countries, Italy included) for the
# same bucket...
h = h.merge(italyDF.groupby('price_bucket')['points'].agg('mean'), how='inner', on='price_bucket')
# ...and compute how far below it each province falls (overall mean minus
# the Italian province mean).
h['difference'] = h['points'] - h['average_points']
h.sort_values('difference', ascending=False).head(10)
| price_bucket | province | average_points | points | difference | |
|---|---|---|---|---|---|
| 15 | Outlandish | Veneto | 89.777778 | 93.958603 | 4.180825 |
| 16 | Outlandish | Sicily & Sardinia | 90.000000 | 93.958603 | 3.958603 |
| 6 | Moderate | Northwestern Italy | 88.000000 | 89.261894 | 1.261894 |
| 9 | Pricey | Lombardy | 89.490196 | 90.694347 | 1.204151 |
| 12 | Expensive | Northeastern Italy | 90.863636 | 91.929532 | 1.065895 |
| 3 | Inexpensive | Italy Other | 87.041667 | 88.047778 | 1.006112 |
| 10 | Pricey | Southern Italy | 89.734177 | 90.694347 | 0.960170 |
| 17 | Outlandish | Italy Other | 93.000000 | 93.958603 | 0.958603 |
| 7 | Moderate | Central Italy | 88.306452 | 89.261894 | 0.955442 |
| 11 | Pricey | Veneto | 89.763889 | 90.694347 | 0.930458 |
fig = px.bar(h.loc[h['price_bucket']!= 'Outlandish'].sort_values('difference', ascending = False).head(10), x = 'province', y = 'difference', color = 'price_bucket',
             labels = {
                 'difference' : 'Review Score Differential (# Points)',
                 'province' : 'Region'
             },
             color_discrete_sequence=px.colors.qualitative.T10,
             # TYPO FIX: 'Form' -> 'From' (matches the varietal chart's title).
             title = 'Italian Wine Difference From World Mean by Region and Price')
fig.update_layout(legend_title = 'Price')
fig
Conclusion: These are the regions and associated prices to avoid when purchasing Italian wine.
Let's look at varietals now.
# Mean points and review count per (price_bucket, variety) for Italian wines.
dfg = onlyItalyDF.sort_values('points').groupby(['price_bucket', 'variety']).agg(
    average_points=('points', 'mean'),
    count=('variety', 'count'),
).dropna()
# Among varietals with more than 5 reviews, keep the five worst per bucket.
g = dfg.loc[dfg['count'] > 5].groupby(level=0, group_keys=False).apply(
    lambda grp: grp.sort_values('average_points').head(5)).reset_index()
# Attach the overall mean for the same bucket and compute the shortfall.
g = g.merge(italyDF.groupby('price_bucket')['points'].agg('mean'), how='inner', on='price_bucket')
g['difference'] = g['points'] - g['average_points']
g.sort_values('difference', ascending=False).head(10).sort_values('price_bucket')
| price_bucket | variety | average_points | count | points | difference | |
|---|---|---|---|---|---|---|
| 10 | Moderate | Montepulciano | 86.285714 | 14 | 89.261894 | 2.976179 |
| 11 | Moderate | Frappato | 87.142857 | 7 | 89.261894 | 2.119036 |
| 12 | Moderate | Primitivo | 87.750000 | 8 | 89.261894 | 1.511894 |
| 15 | Pricey | Montepulciano | 87.285714 | 7 | 90.694347 | 3.408633 |
| 16 | Pricey | Barbera | 88.533333 | 15 | 90.694347 | 2.161014 |
| 17 | Pricey | Merlot | 88.600000 | 15 | 90.694347 | 2.094347 |
| 18 | Pricey | Garganega | 88.833333 | 6 | 90.694347 | 1.861014 |
| 19 | Pricey | Chardonnay | 88.944444 | 36 | 90.694347 | 1.749903 |
| 20 | Expensive | Merlot | 89.750000 | 8 | 91.929532 | 2.179532 |
| 21 | Expensive | Sparkling Blend | 90.285714 | 7 | 91.929532 | 1.643817 |
# Bar chart of the shortfall per varietal, colored by price bucket
# ('Outlandish' excluded, top 30 shortfalls shown).
worstVarietals = (g.loc[g['price_bucket'] != 'Outlandish']
                  .sort_values('difference', ascending=False)
                  .head(30))
fig = px.bar(
    worstVarietals,
    x='variety',
    y='difference',
    color='price_bucket',
    labels={
        'difference': 'Review Score Differential (# Points)',
        'variety': 'Varietal',
    },
    color_discrete_sequence=px.colors.qualitative.T10,
    title='Italian Wine Difference From World Mean by Varietal and Price',
)
fig.update_layout(legend_title='Price')
fig
Conclusion: Don't buy Italian Montepulciano, and in general avoid grapes of French origin for 'Pricey' wines.